{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "%matplotlib inline"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\n# FeatureHasher and DictVectorizer Comparison\n\n\nCompares FeatureHasher and DictVectorizer by using both to vectorize\ntext documents.\n\nThe example demonstrates syntax and speed only; it doesn't actually do\nanything useful with the extracted vectors. See the example scripts\n{document_classification_20newsgroups,clustering}.py for actual learning\non text documents.\n\nA discrepancy between the number of terms reported for DictVectorizer and\nfor FeatureHasher is to be expected due to hash collisions.\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "# Author: Lars Buitinck\n# License: BSD 3 clause\nfrom collections import defaultdict\nimport re\nimport sys\nfrom time import time\n\nimport numpy as np\n\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sklearn.feature_extraction import DictVectorizer, FeatureHasher\n\n\ndef n_nonzero_columns(X):\n    \"\"\"Returns the number of non-zero columns in a CSR matrix X.\"\"\"\n    return len(np.unique(X.nonzero()[1]))\n\n\ndef tokens(doc):\n    \"\"\"Extract tokens from doc.\n\n    This uses a simple regex to break strings into tokens. For a more\n    principled approach, see CountVectorizer or TfidfVectorizer.\n    \"\"\"\n    return (tok.lower() for tok in re.findall(r\"\\w+\", doc))\n\n\ndef token_freqs(doc):\n    \"\"\"Extract a dict mapping tokens from doc to their frequencies.\"\"\"\n    freq = defaultdict(int)\n    for tok in tokens(doc):\n        freq[tok] += 1\n    return freq\n\n\ncategories = [\n    'alt.atheism',\n    'comp.graphics',\n    'comp.sys.ibm.pc.hardware',\n    'misc.forsale',\n    'rec.autos',\n    'sci.space',\n    'talk.religion.misc',\n]\n# Uncomment the following line to use a larger set (11k+ documents)\n# categories = None\n\nprint(__doc__)\nprint(\"Usage: %s [n_features_for_hashing]\" % sys.argv[0])\nprint(\"    The default number of features is 2**18.\")\nprint()\n\ntry:\n    n_features = int(sys.argv[1])\nexcept IndexError:\n    n_features = 2 ** 18\nexcept ValueError:\n    print(\"not a valid number of features: %r\" % sys.argv[1])\n    sys.exit(1)\n\n\nprint(\"Loading 20 newsgroups training data\")\nraw_data, _ = fetch_20newsgroups(subset='train', categories=categories,\n                                 return_X_y=True)\ndata_size_mb = sum(len(s.encode('utf-8')) for s in raw_data) / 1e6\nprint(\"%d documents - %0.3fMB\" % (len(raw_data), data_size_mb))\nprint()\n\nprint(\"DictVectorizer\")\nt0 = time()\nvectorizer = DictVectorizer()\nvectorizer.fit_transform(token_freqs(d) for d in raw_data)\nduration = time() - t0\nprint(\"done in %fs at %0.3fMB/s\" % (duration, data_size_mb / duration))\nprint(\"Found %d unique terms\" % len(vectorizer.get_feature_names()))\nprint()\n\nprint(\"FeatureHasher on frequency dicts\")\nt0 = time()\nhasher = FeatureHasher(n_features=n_features)\nX = hasher.transform(token_freqs(d) for d in raw_data)\nduration = time() - t0\nprint(\"done in %fs at %0.3fMB/s\" % (duration, data_size_mb / duration))\nprint(\"Found %d unique terms\" % n_nonzero_columns(X))\nprint()\n\nprint(\"FeatureHasher on raw tokens\")\nt0 = time()\nhasher = FeatureHasher(n_features=n_features, input_type=\"string\")\nX = hasher.transform(tokens(d) for d in raw_data)\nduration = time() - t0\nprint(\"done in %fs at %0.3fMB/s\" % (duration, data_size_mb / duration))\nprint(\"Found %d unique terms\" % n_nonzero_columns(X))"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.6.9"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}